import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
df=pd.read_csv('Phishing_Legitimate_train_missing_data (2).csv',index_col='id',na_values=['',' ','n/a','null'])
df.head()
| NumDots | SubdomainLevel | PathLevel | UrlLength | NumDash | NumDashInHostname | AtSymbol | TildeSymbol | NumUnderscore | NumPercent | ... | InsecureForms | RelativeFormAction | ExtFormAction | AbnormalFormAction | RightClickDisabled | PopUpWindow | IframeOrFrame | MissingTitle | ImagesOnlyInForm | CLASS_LABEL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | |||||||||||||||||||||
| 1 | 3.0 | 1.0 | 5.0 | 81.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 |
| 2 | 2.0 | 0.0 | 5.0 | 78.0 | 1.0 | 1.0 | 0.0 | 0.0 | 3.0 | 0.0 | ... | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 |
| 3 | 3.0 | 0.0 | 4.0 | 53.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 |
| 4 | 3.0 | 1.0 | 6.0 | 68.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 |
| 5 | 3.0 | 0.0 | 3.0 | 61.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 |
5 rows × 38 columns
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5000 entries, 1 to 5000 Data columns (total 38 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 NumDots 4999 non-null float64 1 SubdomainLevel 4999 non-null float64 2 PathLevel 4999 non-null float64 3 UrlLength 4994 non-null float64 4 NumDash 4998 non-null float64 5 NumDashInHostname 4996 non-null float64 6 AtSymbol 4997 non-null float64 7 TildeSymbol 4997 non-null float64 8 NumUnderscore 4998 non-null float64 9 NumPercent 4997 non-null float64 10 NumQueryComponents 4996 non-null float64 11 NumAmpersand 4997 non-null float64 12 NumHash 4995 non-null float64 13 NumNumericChars 4996 non-null float64 14 NoHttps 4998 non-null float64 15 RandomString 4998 non-null float64 16 IpAddress 4998 non-null float64 17 DomainInSubdomains 4998 non-null float64 18 DomainInPaths 4997 non-null float64 19 HttpsInHostname 4998 non-null float64 20 HostnameLength 4993 non-null float64 21 PathLength 4994 non-null float64 22 QueryLength 4994 non-null float64 23 DoubleSlashInPath 4995 non-null float64 24 NumSensitiveWords 4995 non-null float64 25 EmbeddedBrandName 4995 non-null float64 26 PctExtResourceUrls 4992 non-null float64 27 ExtFavicon 4995 non-null float64 28 InsecureForms 4995 non-null float64 29 RelativeFormAction 4995 non-null float64 30 ExtFormAction 4995 non-null float64 31 AbnormalFormAction 4995 non-null float64 32 RightClickDisabled 4995 non-null float64 33 PopUpWindow 4995 non-null float64 34 IframeOrFrame 4995 non-null float64 35 MissingTitle 4995 non-null float64 36 ImagesOnlyInForm 4995 non-null float64 37 CLASS_LABEL 5000 non-null int64 dtypes: float64(37), int64(1) memory usage: 1.5 MB
missing_rows = df[df.isnull().any(axis=1)]
print(missing_rows)
NumDots SubdomainLevel PathLevel UrlLength NumDash \
id
7 NaN 0.0 1.0 NaN 0.0
23 2.0 0.0 1.0 NaN 12.0
27 4.0 1.0 3.0 72.0 0.0
145 1.0 0.0 6.0 NaN 12.0
150 3.0 1.0 3.0 56.0 0.0
419 3.0 1.0 5.0 73.0 1.0
831 1.0 0.0 0.0 30.0 1.0
903 2.0 NaN NaN NaN NaN
980 2.0 0.0 3.0 52.0 1.0
1011 2.0 1.0 3.0 64.0 1.0
1015 3.0 1.0 4.0 101.0 10.0
1236 1.0 0.0 5.0 NaN 12.0
1238 2.0 1.0 2.0 36.0 1.0
1275 7.0 2.0 2.0 206.0 55.0
1313 2.0 0.0 5.0 60.0 0.0
1760 1.0 0.0 5.0 82.0 NaN
1821 2.0 0.0 5.0 70.0 1.0
2776 2.0 0.0 4.0 73.0 1.0
2777 2.0 1.0 0.0 25.0 0.0
2778 2.0 1.0 3.0 42.0 0.0
2779 2.0 0.0 7.0 86.0 0.0
2780 3.0 1.0 2.0 72.0 4.0
4178 2.0 1.0 3.0 118.0 0.0
4179 3.0 1.0 4.0 53.0 0.0
4558 4.0 1.0 4.0 70.0 1.0
4898 4.0 0.0 1.0 213.0 2.0
4912 4.0 1.0 4.0 60.0 0.0
4963 1.0 0.0 5.0 95.0 8.0
4973 2.0 0.0 1.0 62.0 0.0
4988 3.0 1.0 1.0 NaN 9.0
NumDashInHostname AtSymbol TildeSymbol NumUnderscore NumPercent \
id
7 0.0 0.0 0.0 2.0 0.0
23 0.0 0.0 0.0 0.0 0.0
27 0.0 0.0 0.0 0.0 0.0
145 0.0 0.0 0.0 0.0 0.0
150 0.0 0.0 0.0 0.0 0.0
419 0.0 0.0 0.0 0.0 0.0
831 1.0 0.0 0.0 0.0 0.0
903 NaN NaN NaN 0.0 0.0
980 NaN NaN NaN NaN NaN
1011 0.0 0.0 1.0 0.0 0.0
1015 0.0 0.0 0.0 0.0 0.0
1236 0.0 0.0 0.0 0.0 0.0
1238 0.0 0.0 0.0 0.0 0.0
1275 0.0 0.0 0.0 0.0 0.0
1313 NaN 0.0 0.0 0.0 0.0
1760 NaN NaN NaN NaN NaN
1821 0.0 0.0 0.0 0.0 0.0
2776 0.0 0.0 0.0 0.0 0.0
2777 0.0 0.0 0.0 0.0 0.0
2778 0.0 0.0 0.0 0.0 0.0
2779 0.0 0.0 0.0 0.0 0.0
2780 0.0 0.0 0.0 0.0 0.0
4178 0.0 0.0 0.0 0.0 0.0
4179 0.0 0.0 0.0 0.0 0.0
4558 0.0 0.0 0.0 0.0 0.0
4898 0.0 0.0 0.0 2.0 NaN
4912 0.0 0.0 0.0 0.0 0.0
4963 0.0 0.0 0.0 0.0 0.0
4973 0.0 0.0 0.0 0.0 0.0
4988 0.0 0.0 0.0 2.0 0.0
... InsecureForms RelativeFormAction ExtFormAction \
id ...
7 ... 1.0 0.0 0.0
23 ... 1.0 0.0 0.0
27 ... 1.0 0.0 0.0
145 ... 1.0 0.0 0.0
150 ... 1.0 0.0 0.0
419 ... 1.0 0.0 0.0
831 ... 0.0 0.0 0.0
903 ... 0.0 0.0 0.0
980 ... 1.0 0.0 0.0
1011 ... 1.0 0.0 0.0
1015 ... 1.0 0.0 0.0
1236 ... 0.0 0.0 0.0
1238 ... 1.0 0.0 0.0
1275 ... 0.0 0.0 0.0
1313 ... 1.0 0.0 0.0
1760 ... 1.0 0.0 0.0
1821 ... 1.0 0.0 0.0
2776 ... NaN NaN NaN
2777 ... NaN NaN NaN
2778 ... NaN NaN NaN
2779 ... NaN NaN NaN
2780 ... NaN NaN NaN
4178 ... 1.0 0.0 0.0
4179 ... 1.0 1.0 0.0
4558 ... 1.0 0.0 0.0
4898 ... 0.0 0.0 0.0
4912 ... 1.0 1.0 0.0
4963 ... 1.0 1.0 0.0
4973 ... 1.0 1.0 0.0
4988 ... 0.0 0.0 0.0
AbnormalFormAction RightClickDisabled PopUpWindow IframeOrFrame \
id
7 0.0 0.0 0.0 0.0
23 0.0 1.0 0.0 1.0
27 0.0 0.0 0.0 0.0
145 0.0 0.0 0.0 1.0
150 0.0 0.0 0.0 1.0
419 0.0 0.0 0.0 0.0
831 0.0 0.0 0.0 0.0
903 0.0 0.0 0.0 1.0
980 0.0 0.0 0.0 0.0
1011 0.0 0.0 0.0 0.0
1015 0.0 0.0 0.0 1.0
1236 0.0 0.0 0.0 0.0
1238 0.0 0.0 0.0 0.0
1275 0.0 0.0 0.0 0.0
1313 0.0 0.0 0.0 0.0
1760 0.0 0.0 0.0 0.0
1821 0.0 0.0 0.0 1.0
2776 NaN NaN NaN NaN
2777 NaN NaN NaN NaN
2778 NaN NaN NaN NaN
2779 NaN NaN NaN NaN
2780 NaN NaN NaN NaN
4178 0.0 0.0 0.0 0.0
4179 0.0 0.0 0.0 0.0
4558 0.0 0.0 0.0 0.0
4898 0.0 0.0 0.0 1.0
4912 0.0 0.0 0.0 0.0
4963 0.0 0.0 0.0 1.0
4973 0.0 0.0 0.0 1.0
4988 0.0 0.0 0.0 1.0
MissingTitle ImagesOnlyInForm CLASS_LABEL
id
7 0.0 0.0 0
23 0.0 0.0 0
27 0.0 0.0 1
145 0.0 0.0 0
150 0.0 0.0 1
419 0.0 0.0 1
831 0.0 0.0 0
903 0.0 0.0 0
980 0.0 0.0 1
1011 0.0 0.0 1
1015 0.0 0.0 0
1236 0.0 0.0 0
1238 0.0 0.0 1
1275 0.0 0.0 0
1313 0.0 0.0 1
1760 0.0 0.0 0
1821 0.0 0.0 1
2776 NaN NaN 1
2777 NaN NaN 0
2778 NaN NaN 0
2779 NaN NaN 0
2780 NaN NaN 0
4178 0.0 0.0 1
4179 1.0 0.0 1
4558 0.0 0.0 1
4898 0.0 0.0 0
4912 0.0 0.0 0
4963 0.0 0.0 0
4973 0.0 0.0 0
4988 0.0 0.0 0
[30 rows x 38 columns]
rows_to_drop= df[ df.isnull().sum(axis=1) > 2 ].index
print(rows_to_drop)
df.drop(rows_to_drop,inplace=True)
print(df.shape)
Int64Index([7, 903, 980, 1760, 1821, 2776, 2777, 2778, 2779, 2780], dtype='int64', name='id') (4990, 38)
missing_values = df.isna().sum()
print(missing_values)
NumDots 0 SubdomainLevel 0 PathLevel 0 UrlLength 4 NumDash 0 NumDashInHostname 1 AtSymbol 0 TildeSymbol 0 NumUnderscore 0 NumPercent 1 NumQueryComponents 1 NumAmpersand 0 NumHash 2 NumNumericChars 2 NoHttps 0 RandomString 0 IpAddress 0 DomainInSubdomains 0 DomainInPaths 1 HttpsInHostname 0 HostnameLength 5 PathLength 0 QueryLength 0 DoubleSlashInPath 0 NumSensitiveWords 0 EmbeddedBrandName 0 PctExtResourceUrls 3 ExtFavicon 0 InsecureForms 0 RelativeFormAction 0 ExtFormAction 0 AbnormalFormAction 0 RightClickDisabled 0 PopUpWindow 0 IframeOrFrame 0 MissingTitle 0 ImagesOnlyInForm 0 CLASS_LABEL 0 dtype: int64
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=10)
blanks=df[['UrlLength','NumDashInHostname','NumPercent','NumQueryComponents','NumHash','NumNumericChars','DomainInPaths','HostnameLength','PctExtResourceUrls']].to_numpy()
blanks_imputed=imputer.fit_transform(blanks)
print(blanks_imputed)
df[['UrlLength','NumDashInHostname','NumPercent','NumQueryComponents','NumHash','NumNumericChars','DomainInPaths','HostnameLength','PctExtResourceUrls']]=blanks_imputed
print(df.head(50))
[[81. 1. 0. ... 1. 29.
0. ]
[78. 1. 0. ... 0. 13.
1. ]
[53. 0. 0. ... 0. 16.
1. ]
...
[33. 0. 0. ... 0. 25.
0.64705882]
[47. 0. 0. ... 0. 15.
0. ]
[37. 0. 0. ... 0. 17.
0.14285714]]
NumDots SubdomainLevel PathLevel UrlLength NumDash NumDashInHostname \
id
1 3.0 1.0 5.0 81.0 1.0 1.0
2 2.0 0.0 5.0 78.0 1.0 1.0
3 3.0 0.0 4.0 53.0 1.0 0.0
4 3.0 1.0 6.0 68.0 0.0 0.0
5 3.0 0.0 3.0 61.0 0.0 0.0
6 3.0 1.0 2.0 55.0 0.0 0.0
8 2.0 0.0 1.0 63.0 0.0 0.0
9 3.0 0.0 3.0 58.0 0.0 0.0
10 2.0 0.0 4.0 52.0 0.0 0.0
11 4.0 1.0 5.0 75.0 2.0 2.0
12 1.0 0.0 5.0 61.0 4.0 0.0
13 2.0 0.0 2.0 39.0 0.0 0.0
14 2.0 1.0 3.0 50.0 1.0 0.0
15 1.0 0.0 4.0 148.0 14.0 0.0
16 1.0 0.0 3.0 46.0 1.0 0.0
17 2.0 1.0 2.0 52.0 1.0 1.0
18 2.0 0.0 6.0 89.0 0.0 0.0
19 4.0 1.0 3.0 53.0 1.0 0.0
20 2.0 1.0 4.0 76.0 0.0 0.0
21 2.0 0.0 2.0 100.0 1.0 0.0
22 2.0 0.0 4.0 48.0 0.0 0.0
23 2.0 0.0 1.0 76.7 12.0 0.0
24 1.0 0.0 2.0 83.0 7.0 0.0
25 2.0 0.0 3.0 80.0 0.0 0.0
26 2.0 0.0 4.0 50.0 1.0 0.0
27 4.0 1.0 3.0 72.0 0.0 0.0
28 2.0 1.0 0.0 20.0 0.0 0.0
29 3.0 2.0 2.0 74.0 6.0 0.0
30 1.0 0.0 5.0 75.0 9.0 0.0
31 3.0 1.0 3.0 48.0 1.0 1.0
32 2.0 0.0 5.0 66.0 1.0 0.0
33 2.0 1.0 0.0 23.0 0.0 0.0
34 2.0 0.0 3.0 52.0 3.0 2.0
35 3.0 1.0 2.0 56.0 100.0 0.0
36 2.0 1.0 2.0 39.0 2.0 0.0
37 4.0 0.0 5.0 87.0 1.0 0.0
38 1.0 0.0 4.0 52.0 1.0 0.0
39 2.0 0.0 3.0 70.0 5.0 0.0
40 2.0 1.0 2.0 53.0 0.0 0.0
41 1.0 0.0 0.0 37.0 0.0 0.0
42 2.0 0.0 5.0 61.0 0.0 0.0
43 2.0 1.0 2.0 71.0 3.0 1.0
44 2.0 0.0 2.0 52.0 0.0 0.0
45 3.0 1.0 5.0 58.0 2.0 1.0
46 2.0 1.0 4.0 48.0 0.0 0.0
47 2.0 0.0 1.0 80.0 5.0 0.0
48 1.0 0.0 5.0 98.0 11.0 0.0
49 2.0 1.0 2.0 45.0 2.0 0.0
50 2.0 0.0 3.0 115.0 0.0 0.0
51 6.0 1.0 6.0 103.0 0.0 0.0
AtSymbol TildeSymbol NumUnderscore NumPercent ... InsecureForms \
id ...
1 0.0 0.0 1.0 0.0 ... 1.0
2 0.0 0.0 3.0 0.0 ... 1.0
3 0.0 0.0 0.0 0.0 ... 1.0
4 0.0 0.0 0.0 0.0 ... 1.0
5 0.0 0.0 0.0 0.0 ... 1.0
6 0.0 0.0 1.0 0.0 ... 0.0
8 0.0 0.0 0.0 0.0 ... 1.0
9 0.0 0.0 0.0 0.0 ... 1.0
10 0.0 0.0 0.0 0.0 ... 1.0
11 0.0 0.0 0.0 0.0 ... 1.0
12 0.0 0.0 0.0 0.0 ... 0.0
13 0.0 0.0 0.0 0.0 ... 1.0
14 0.0 0.0 0.0 0.0 ... 1.0
15 0.0 0.0 0.0 0.0 ... 1.0
16 0.0 0.0 0.0 0.0 ... 1.0
17 0.0 0.0 0.0 0.0 ... 0.0
18 0.0 0.0 1.0 0.0 ... 1.0
19 0.0 0.0 0.0 0.0 ... 1.0
20 0.0 0.0 0.0 0.0 ... 0.0
21 0.0 0.0 5.0 0.0 ... 1.0
22 0.0 0.0 0.0 0.0 ... 1.0
23 0.0 0.0 0.0 0.0 ... 1.0
24 0.0 0.0 0.0 0.0 ... 1.0
25 0.0 0.0 1.0 0.0 ... 0.0
26 0.0 0.0 0.0 0.0 ... 1.0
27 0.0 0.0 0.0 0.0 ... 1.0
28 0.0 0.0 0.0 0.0 ... 1.0
29 0.0 0.0 0.0 0.0 ... 1.0
30 0.0 0.0 0.0 0.0 ... 1.0
31 0.0 0.0 0.0 0.0 ... 1.0
32 0.0 0.0 1.0 0.0 ... 1.0
33 0.0 0.0 0.0 0.0 ... 0.0
34 0.0 0.0 0.0 0.0 ... 1.0
35 0.0 0.0 0.0 0.0 ... 1.0
36 0.0 0.0 0.0 0.0 ... 1.0
37 0.0 0.0 0.0 0.0 ... 1.0
38 0.0 0.0 0.0 0.0 ... 1.0
39 0.0 0.0 0.0 0.0 ... 1.0
40 0.0 0.0 0.0 2.0 ... 1.0
41 0.0 0.0 2.0 0.0 ... 0.0
42 0.0 0.0 0.0 0.0 ... 1.0
43 0.0 0.0 0.0 0.0 ... 1.0
44 0.0 0.0 0.0 0.0 ... 1.0
45 0.0 0.0 0.0 0.0 ... 1.0
46 0.0 0.0 0.0 0.0 ... 1.0
47 0.0 0.0 0.0 0.0 ... 1.0
48 0.0 0.0 0.0 0.0 ... 1.0
49 0.0 0.0 0.0 0.0 ... 0.0
50 0.0 0.0 0.0 1.0 ... 1.0
51 0.0 0.0 0.0 0.0 ... 1.0
RelativeFormAction ExtFormAction AbnormalFormAction RightClickDisabled \
id
1 0.0 0.0 0.0 0.0
2 1.0 0.0 0.0 0.0
3 0.0 1.0 0.0 0.0
4 0.0 0.0 0.0 0.0
5 0.0 0.0 0.0 0.0
6 0.0 0.0 0.0 0.0
8 0.0 0.0 0.0 0.0
9 1.0 0.0 0.0 0.0
10 1.0 0.0 0.0 0.0
11 1.0 0.0 0.0 0.0
12 0.0 0.0 0.0 0.0
13 0.0 0.0 0.0 0.0
14 0.0 1.0 0.0 0.0
15 0.0 0.0 0.0 0.0
16 0.0 0.0 0.0 0.0
17 1.0 0.0 0.0 0.0
18 0.0 0.0 0.0 0.0
19 0.0 1.0 0.0 0.0
20 0.0 0.0 0.0 0.0
21 1.0 0.0 0.0 0.0
22 0.0 0.0 0.0 0.0
23 0.0 0.0 0.0 1.0
24 0.0 0.0 0.0 0.0
25 0.0 1.0 0.0 0.0
26 0.0 0.0 0.0 0.0
27 0.0 0.0 0.0 0.0
28 0.0 0.0 0.0 0.0
29 0.0 0.0 0.0 0.0
30 1.0 1.0 0.0 0.0
31 1.0 0.0 0.0 0.0
32 0.0 0.0 0.0 0.0
33 0.0 0.0 0.0 0.0
34 0.0 0.0 0.0 0.0
35 1.0 0.0 1.0 0.0
36 0.0 0.0 0.0 0.0
37 0.0 0.0 0.0 0.0
38 0.0 0.0 0.0 0.0
39 0.0 1.0 0.0 0.0
40 1.0 0.0 0.0 0.0
41 0.0 0.0 0.0 0.0
42 1.0 0.0 0.0 0.0
43 0.0 0.0 0.0 0.0
44 0.0 0.0 0.0 0.0
45 0.0 0.0 0.0 0.0
46 0.0 0.0 0.0 0.0
47 1.0 0.0 0.0 0.0
48 0.0 0.0 0.0 0.0
49 0.0 0.0 0.0 0.0
50 0.0 0.0 0.0 0.0
51 1.0 0.0 0.0 0.0
PopUpWindow IframeOrFrame MissingTitle ImagesOnlyInForm CLASS_LABEL
id
1 0.0 0.0 0.0 0.0 0
2 0.0 0.0 0.0 0.0 1
3 0.0 1.0 0.0 0.0 1
4 0.0 0.0 0.0 0.0 1
5 0.0 1.0 0.0 0.0 1
6 0.0 0.0 1.0 0.0 0
8 0.0 0.0 0.0 0.0 0
9 0.0 0.0 1.0 0.0 1
10 0.0 0.0 0.0 0.0 1
11 0.0 0.0 0.0 0.0 1
12 0.0 1.0 0.0 0.0 0
13 0.0 0.0 0.0 0.0 1
14 0.0 1.0 0.0 0.0 0
15 0.0 1.0 0.0 0.0 0
16 0.0 0.0 0.0 0.0 0
17 0.0 0.0 0.0 0.0 1
18 0.0 0.0 1.0 0.0 1
19 0.0 1.0 0.0 0.0 1
20 0.0 0.0 1.0 0.0 0
21 0.0 0.0 0.0 0.0 1
22 0.0 0.0 0.0 0.0 1
23 0.0 1.0 0.0 0.0 0
24 0.0 1.0 0.0 0.0 0
25 0.0 0.0 0.0 0.0 0
26 0.0 0.0 0.0 0.0 1
27 0.0 0.0 0.0 0.0 1
28 0.0 1.0 0.0 0.0 0
29 0.0 0.0 0.0 0.0 0
30 0.0 1.0 0.0 0.0 0
31 0.0 0.0 0.0 0.0 1
32 0.0 0.0 0.0 0.0 1
33 0.0 1.0 0.0 0.0 0
34 0.0 0.0 0.0 0.0 1
35 0.0 0.0 0.0 0.0 0
36 0.0 0.0 0.0 0.0 0
37 0.0 0.0 0.0 0.0 1
38 0.0 1.0 0.0 0.0 1
39 0.0 0.0 0.0 0.0 0
40 0.0 1.0 0.0 0.0 0
41 0.0 0.0 0.0 0.0 0
42 0.0 0.0 0.0 0.0 1
43 0.0 0.0 0.0 0.0 0
44 0.0 0.0 0.0 0.0 1
45 0.0 0.0 0.0 0.0 1
46 0.0 0.0 0.0 1.0 1
47 0.0 1.0 0.0 0.0 0
48 0.0 0.0 0.0 0.0 0
49 0.0 1.0 1.0 0.0 0
50 0.0 1.0 0.0 0.0 0
51 0.0 0.0 0.0 0.0 1
[50 rows x 38 columns]
from sklearn.neighbors import LocalOutlierFactor
clf = LocalOutlierFactor(n_neighbors=20)
x = df[['NumDots','SubdomainLevel','PathLevel','UrlLength','NumDash','NumDashInHostname','NumUnderscore','NumQueryComponents','NumAmpersand','NumNumericChars','HostnameLength','PathLength','QueryLength']].to_numpy()
outlier_label=clf.fit_predict(x)
print(clf.negative_outlier_factor_)
print(clf.offset_)
print(outlier_label)
rows_to_drop= df.iloc[ clf.negative_outlier_factor_ < -1.5].index
print(rows_to_drop)
df.drop(rows_to_drop,inplace=True)
print(df.shape)
df.head
[-1.09411557 -1.09907337 -0.98927867 ... -1.22883689 -0.99270466
-1.02708277]
-1.5
[1 1 1 ... 1 1 1]
Int64Index([ 23, 35, 60, 97, 145, 150, 188, 216, 251, 258,
...
4519, 4597, 4697, 4698, 4718, 4753, 4756, 4763, 4865, 4988],
dtype='int64', name='id', length=133)
(4857, 38)
<bound method NDFrame.head of NumDots SubdomainLevel PathLevel UrlLength NumDash \
id
1 3.0 1.0 5.0 81.0 1.0
2 2.0 0.0 5.0 78.0 1.0
3 3.0 0.0 4.0 53.0 1.0
4 3.0 1.0 6.0 68.0 0.0
5 3.0 0.0 3.0 61.0 0.0
... ... ... ... ... ...
4996 3.0 1.0 1.0 67.0 3.0
4997 1.0 0.0 2.0 36.0 1.0
4998 3.0 2.0 0.0 33.0 0.0
4999 3.0 1.0 2.0 47.0 0.0
5000 1.0 0.0 2.0 37.0 0.0
NumDashInHostname AtSymbol TildeSymbol NumUnderscore NumPercent \
id
1 1.0 0.0 0.0 1.0 0.0
2 1.0 0.0 0.0 3.0 0.0
3 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0
5 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ...
4996 0.0 0.0 0.0 0.0 0.0
4997 0.0 0.0 0.0 0.0 0.0
4998 0.0 0.0 0.0 0.0 0.0
4999 0.0 0.0 0.0 0.0 0.0
5000 0.0 0.0 0.0 0.0 0.0
... InsecureForms RelativeFormAction ExtFormAction \
id ...
1 ... 1.0 0.0 0.0
2 ... 1.0 1.0 0.0
3 ... 1.0 0.0 1.0
4 ... 1.0 0.0 0.0
5 ... 1.0 0.0 0.0
... ... ... ... ...
4996 ... 1.0 0.0 0.0
4997 ... 0.0 0.0 0.0
4998 ... 1.0 0.0 1.0
4999 ... 1.0 1.0 0.0
5000 ... 1.0 0.0 0.0
AbnormalFormAction RightClickDisabled PopUpWindow IframeOrFrame \
id
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 1.0
4 0.0 0.0 0.0 0.0
5 0.0 0.0 0.0 1.0
... ... ... ... ...
4996 0.0 0.0 0.0 1.0
4997 0.0 0.0 0.0 1.0
4998 0.0 0.0 0.0 1.0
4999 0.0 0.0 0.0 0.0
5000 0.0 0.0 0.0 1.0
MissingTitle ImagesOnlyInForm CLASS_LABEL
id
1 0.0 0.0 0
2 0.0 0.0 1
3 0.0 0.0 1
4 0.0 0.0 1
5 0.0 0.0 1
... ... ... ...
4996 0.0 0.0 0
4997 0.0 0.0 0
4998 0.0 0.0 0
4999 1.0 0.0 1
5000 0.0 0.0 1
[4857 rows x 38 columns]>
df=pd.get_dummies(df, columns=['AtSymbol','TildeSymbol','NoHttps', 'RandomString', 'IpAddress', 'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname', 'DoubleSlashInPath', 'ExtFavicon', 'InsecureForms', 'RelativeFormAction', 'ExtFormAction', 'AbnormalFormAction', 'RightClickDisabled', 'PopUpWindow', 'IframeOrFrame', 'MissingTitle'],drop_first=True)
print(df)
NumDots SubdomainLevel PathLevel UrlLength NumDash \
id
1 3.0 1.0 5.0 81.0 1.0
2 2.0 0.0 5.0 78.0 1.0
3 3.0 0.0 4.0 53.0 1.0
4 3.0 1.0 6.0 68.0 0.0
5 3.0 0.0 3.0 61.0 0.0
... ... ... ... ... ...
4996 3.0 1.0 1.0 67.0 3.0
4997 1.0 0.0 2.0 36.0 1.0
4998 3.0 2.0 0.0 33.0 0.0
4999 3.0 1.0 2.0 47.0 0.0
5000 1.0 0.0 2.0 37.0 0.0
NumDashInHostname NumUnderscore NumPercent NumQueryComponents \
id
1 1.0 1.0 0.0 0.0
2 1.0 3.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0
5 0.0 0.0 0.0 0.0
... ... ... ... ...
4996 0.0 0.0 0.0 0.0
4997 0.0 0.0 0.0 0.0
4998 0.0 0.0 0.0 0.0
4999 0.0 0.0 0.0 0.0
5000 0.0 0.0 0.0 0.0
NumAmpersand ... DoubleSlashInPath_1.0 ExtFavicon_1.0 \
id ...
1 0.0 ... 0 0
2 0.0 ... 0 1
3 0.0 ... 0 0
4 0.0 ... 0 1
5 0.0 ... 0 1
... ... ... ... ...
4996 0.0 ... 0 0
4997 0.0 ... 0 0
4998 0.0 ... 0 0
4999 0.0 ... 0 0
5000 0.0 ... 0 1
InsecureForms_1.0 RelativeFormAction_1.0 ExtFormAction_1.0 \
id
1 1 0 0
2 1 1 0
3 1 0 1
4 1 0 0
5 1 0 0
... ... ... ...
4996 1 0 0
4997 0 0 0
4998 1 0 1
4999 1 1 0
5000 1 0 0
AbnormalFormAction_1.0 RightClickDisabled_1.0 PopUpWindow_1.0 \
id
1 0 0 0
2 0 0 0
3 0 0 0
4 0 0 0
5 0 0 0
... ... ... ...
4996 0 0 0
4997 0 0 0
4998 0 0 0
4999 0 0 0
5000 0 0 0
IframeOrFrame_1.0 MissingTitle_1.0
id
1 0 0
2 0 0
3 1 0
4 0 0
5 1 0
... ... ...
4996 1 0
4997 1 0
4998 1 0
4999 0 1
5000 1 0
[4857 rows x 38 columns]
df['URLCharacteristics']=np.mean(df[['NumDots','UrlLength','NumDash','NumDashInHostname','NumUnderscore','NumQueryComponents','NumAmpersand','NumNumericChars','PathLength','QueryLength']],axis=1)
max_URLCharacteristics=df['URLCharacteristics'].max()
min_URLCharacteristics=df['URLCharacteristics'].min()
df['URLCharacteristics']=(df['URLCharacteristics']-min_URLCharacteristics)/(max_URLCharacteristics-min_URLCharacteristics)
print(df['URLCharacteristics'])
id
1 0.231330
2 0.233151
3 0.123862
4 0.165756
5 0.149362
...
4996 0.162113
4997 0.065574
4998 0.032787
4999 0.102004
5000 0.058288
Name: URLCharacteristics, Length: 4857, dtype: float64
columns_to_plot_URL=['CLASS_LABEL', 'NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash', 'NumDashInHostname',
'AtSymbol', 'TildeSymbol', 'NumUnderscore', 'NumPercent', 'NumQueryComponents',
'NumAmpersand', 'NumHash', 'NumNumericChars']
g=sns.PairGrid(df[columns_to_plot_URL], hue='CLASS_LABEL')
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
<seaborn.axisgrid.PairGrid at 0x7b72ea229420>
correlation_matrix = df[columns_to_plot_URL].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title('Correlation Heatmap')
Text(0.5, 1.0, 'Correlation Heatmap')
columns_to_plot_DomainHostname=['CLASS_LABEL', 'NoHttps', 'RandomString', 'IpAddress', 'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname', 'HostnameLength']
g=sns.PairGrid(df[columns_to_plot_DomainHostname], hue='CLASS_LABEL')
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
<seaborn.axisgrid.PairGrid at 0x7b73037088e0>
correlation_matrix = df[columns_to_plot_DomainHostname].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title('Correlation Heatmap')
Text(0.5, 1.0, 'Correlation Heatmap')
columns_to_plot_path=['CLASS_LABEL', 'PathLength', 'QueryLength', 'DoubleSlashInPath']
g=sns.PairGrid(df[columns_to_plot_path], hue='CLASS_LABEL')
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
<seaborn.axisgrid.PairGrid at 0x7b72ebc3a170>
correlation_matrix = df[columns_to_plot_path].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title('Correlation Heatmap')
Text(0.5, 1.0, 'Correlation Heatmap')
columns_to_plot_FormAction=['CLASS_LABEL', 'NumSensitiveWords', 'EmbeddedBrandName', 'PctExtResourceUrls', 'ExtFavicon', 'InsecureForms',
'RelativeFormAction', 'ExtFormAction', 'AbnormalFormAction', 'RightClickDisabled', 'PopUpWindow',
'IframeOrFrame', 'MissingTitle', 'ImagesOnlyInForm']
g=sns.PairGrid(df[columns_to_plot_FormAction], hue='CLASS_LABEL')
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
<seaborn.axisgrid.PairGrid at 0x7b72fee0c460>
correlation_matrix = df[columns_to_plot_FormAction].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title('Correlation Heatmap')
Text(0.5, 1.0, 'Correlation Heatmap')
columns_to_select_URL=['NumDots', 'PathLevel', 'UrlLength', 'NumDash', 'NumDashInHostname',
'AtSymbol', 'TildeSymbol', 'NumUnderscore', 'NumPercent',
'NumNumericChars']
rfe_selector = RFE(estimator=LogisticRegression(),n_features_to_select = 3, step = 1)
rfe_selector.fit(df[columns_to_select_URL], df['CLASS_LABEL'])
print(rfe_selector.get_support())
df[columns_to_select_URL].columns[ rfe_selector.get_support() ]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-24-90b218493e60> in <cell line: 5>() 3 'NumNumericChars'] 4 ----> 5 rfe_selector = RFE(estimator=LogisticRegression(),n_features_to_select = 3, step = 1) 6 rfe_selector.fit(df[columns_to_select_URL], df['CLASS_LABEL']) 7 print(rfe_selector.get_support()) NameError: name 'RFE' is not defined
columns_to_select_DomainHostname=['NoHttps', 'RandomString', 'IpAddress', 'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname']
rfe_selector = RFE(estimator=LogisticRegression(),n_features_to_select = 3, step = 1)
rfe_selector.fit(df[columns_to_select_DomainHostname], df['CLASS_LABEL'])
print(rfe_selector.get_support())
df[columns_to_select_DomainHostname].columns[ rfe_selector.get_support() ]
[ True False True True False False]
Index(['NoHttps', 'IpAddress', 'DomainInSubdomains'], dtype='object')
columns_to_select_path=['PathLength', 'QueryLength', 'DoubleSlashInPath']
rfe_selector = RFE(estimator=LogisticRegression(),n_features_to_select = 1, step = 1)
rfe_selector.fit(df[columns_to_select_path], df['CLASS_LABEL'])
print(rfe_selector.get_support())
df[columns_to_select_path].columns[ rfe_selector.get_support() ]
[False False True]
Index(['DoubleSlashInPath'], dtype='object')
columns_to_select_FormAction=['NumSensitiveWords', 'EmbeddedBrandName', 'PctExtResourceUrls', 'InsecureForms',
'ExtFormAction', 'AbnormalFormAction', 'RightClickDisabled', 'PopUpWindow',
'IframeOrFrame', 'MissingTitle', 'ImagesOnlyInForm']
rfe_selector = RFE(estimator=LogisticRegression(),n_features_to_select = 3, step = 1)
rfe_selector.fit(df[columns_to_select_FormAction], df['CLASS_LABEL'])
print(rfe_selector.get_support())
df[columns_to_select_FormAction].columns[ rfe_selector.get_support() ]
[False False False True False False False True False True False]
Index(['InsecureForms', 'PopUpWindow', 'MissingTitle'], dtype='object')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
X=df.drop("CLASS_LABEL", axis=1)
Y=df["CLASS_LABEL"]
clf = tree.DecisionTreeClassifier(max_depth=3)
clf = clf.fit(X, Y)
Y_predicted=clf.predict(X)
plt.figure(figsize=(14, 7))
tree.plot_tree(clf.fit(X,Y),filled=True,)
feature_importances = clf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df[feature_importance_df['Importance'] != 0]
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importances:")
print(feature_importance_df)
Feature Importances:
Feature Importance
4 NumDash 0.419380
28 InsecureForms 0.234885
10 NumQueryComponents 0.191280
5 NumDashInHostname 0.079934
14 NoHttps 0.038053
26 PctExtResourceUrls 0.032394
0 NumDots 0.004073
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(Y, Y_predicted, labels=clf.classes_)
disp=ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=clf.classes_)
disp.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7c269b116770>
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
ac=accuracy_score(Y, Y_predicted)
print(ac)
pre=precision_score(Y, Y_predicted,average=None)
print(pre)
recall=recall_score(Y, Y_predicted,average=None)
print(recall)
f1 = f1_score(Y, Y_predicted, average=None)
print(f1)
0.8060531192093885 [0.85755396 0.76731602] [0.73489519 0.87747525] [0.79150066 0.8187067 ]
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
print(min_samples_splits)
max_depths = [None, 5, 10, 20]
print(max_depths)
tuned_parameters = [{'min_samples_split': min_samples_splits, 'max_depth': max_depths}]
base_model = DecisionTreeClassifier()
clf = GridSearchCV(estimator=base_model, param_grid=tuned_parameters, cv=5, verbose=3, scoring='roc_auc')
clf.fit(X, Y)
print("Grid Search Results:")
print(clf.cv_results_)
best_params = clf.best_params_
print("Best Parameters:", best_params)
mean_test_scores = clf.cv_results_['mean_test_score']
print("Mean Test Scores:", mean_test_scores)
best_model = clf.best_estimator_
y_pred_proba = best_model.predict_proba(X)[:, 1]
best_model_auc = roc_auc_score(Y, y_pred_proba)
print("AUC for the Best Model:", best_model_auc)
[0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
[None, 5, 10, 20]
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END max_depth=None, min_samples_split=0.1;, score=0.937 total time= 0.0s
[CV 2/5] END max_depth=None, min_samples_split=0.1;, score=0.947 total time= 0.0s
[CV 3/5] END max_depth=None, min_samples_split=0.1;, score=0.935 total time= 0.0s
[CV 4/5] END max_depth=None, min_samples_split=0.1;, score=0.926 total time= 0.0s
[CV 5/5] END max_depth=None, min_samples_split=0.1;, score=0.937 total time= 0.0s
[CV 1/5] END max_depth=None, min_samples_split=0.2;, score=0.916 total time= 0.0s
[CV 2/5] END max_depth=None, min_samples_split=0.2;, score=0.914 total time= 0.0s
[CV 3/5] END max_depth=None, min_samples_split=0.2;, score=0.897 total time= 0.0s
[CV 4/5] END max_depth=None, min_samples_split=0.2;, score=0.881 total time= 0.0s
[CV 5/5] END max_depth=None, min_samples_split=0.2;, score=0.895 total time= 0.0s
[CV 1/5] END max_depth=None, min_samples_split=0.30000000000000004;, score=0.849 total time= 0.0s
[CV 2/5] END max_depth=None, min_samples_split=0.30000000000000004;, score=0.880 total time= 0.0s
[CV 3/5] END max_depth=None, min_samples_split=0.30000000000000004;, score=0.876 total time= 0.0s
[CV 4/5] END max_depth=None, min_samples_split=0.30000000000000004;, score=0.849 total time= 0.0s
[CV 5/5] END max_depth=None, min_samples_split=0.30000000000000004;, score=0.870 total time= 0.0s
[CV 1/5] END max_depth=None, min_samples_split=0.4;, score=0.828 total time= 0.0s
[CV 2/5] END max_depth=None, min_samples_split=0.4;, score=0.852 total time= 0.0s
[CV 3/5] END max_depth=None, min_samples_split=0.4;, score=0.850 total time= 0.0s
[CV 4/5] END max_depth=None, min_samples_split=0.4;, score=0.821 total time= 0.0s
[CV 5/5] END max_depth=None, min_samples_split=0.4;, score=0.846 total time= 0.0s
[CV 1/5] END max_depth=None, min_samples_split=0.5;, score=0.828 total time= 0.0s
[CV 2/5] END max_depth=None, min_samples_split=0.5;, score=0.852 total time= 0.0s
[CV 3/5] END max_depth=None, min_samples_split=0.5;, score=0.850 total time= 0.0s
[CV 4/5] END max_depth=None, min_samples_split=0.5;, score=0.821 total time= 0.0s
[CV 5/5] END max_depth=None, min_samples_split=0.5;, score=0.846 total time= 0.0s
[CV 1/5] END max_depth=None, min_samples_split=0.6;, score=0.797 total time= 0.0s
[CV 2/5] END max_depth=None, min_samples_split=0.6;, score=0.813 total time= 0.0s
[CV 3/5] END max_depth=None, min_samples_split=0.6;, score=0.814 total time= 0.0s
[CV 4/5] END max_depth=None, min_samples_split=0.6;, score=0.782 total time= 0.0s
[CV 5/5] END max_depth=None, min_samples_split=0.6;, score=0.814 total time= 0.0s
[CV 1/5] END max_depth=None, min_samples_split=0.7000000000000001;, score=0.750 total time= 0.0s
[CV 2/5] END max_depth=None, min_samples_split=0.7000000000000001;, score=0.753 total time= 0.0s
[CV 3/5] END max_depth=None, min_samples_split=0.7000000000000001;, score=0.745 total time= 0.0s
[CV 4/5] END max_depth=None, min_samples_split=0.7000000000000001;, score=0.728 total time= 0.0s
[CV 5/5] END max_depth=None, min_samples_split=0.7000000000000001;, score=0.752 total time= 0.0s
[CV 1/5] END max_depth=None, min_samples_split=0.8;, score=0.692 total time= 0.0s
[CV 2/5] END max_depth=None, min_samples_split=0.8;, score=0.684 total time= 0.0s
[CV 3/5] END max_depth=None, min_samples_split=0.8;, score=0.670 total time= 0.0s
[CV 4/5] END max_depth=None, min_samples_split=0.8;, score=0.654 total time= 0.0s
[CV 5/5] END max_depth=None, min_samples_split=0.8;, score=0.677 total time= 0.0s
[CV 1/5] END max_depth=None, min_samples_split=0.9;, score=0.692 total time= 0.0s
[CV 2/5] END max_depth=None, min_samples_split=0.9;, score=0.684 total time= 0.0s
[CV 3/5] END max_depth=None, min_samples_split=0.9;, score=0.670 total time= 0.0s
[CV 4/5] END max_depth=None, min_samples_split=0.9;, score=0.654 total time= 0.0s
[CV 5/5] END max_depth=None, min_samples_split=0.9;, score=0.677 total time= 0.0s
[CV 1/5] END max_depth=None, min_samples_split=1.0;, score=0.692 total time= 0.0s
[CV 2/5] END max_depth=None, min_samples_split=1.0;, score=0.684 total time= 0.0s
[CV 3/5] END max_depth=None, min_samples_split=1.0;, score=0.670 total time= 0.0s
[CV 4/5] END max_depth=None, min_samples_split=1.0;, score=0.654 total time= 0.0s
[CV 5/5] END max_depth=None, min_samples_split=1.0;, score=0.677 total time= 0.0s
[CV 1/5] END max_depth=5, min_samples_split=0.1;, score=0.909 total time= 0.0s
[CV 2/5] END max_depth=5, min_samples_split=0.1;, score=0.926 total time= 0.0s
[CV 3/5] END max_depth=5, min_samples_split=0.1;, score=0.906 total time= 0.0s
[CV 4/5] END max_depth=5, min_samples_split=0.1;, score=0.882 total time= 0.0s
[CV 5/5] END max_depth=5, min_samples_split=0.1;, score=0.901 total time= 0.0s
[CV 1/5] END max_depth=5, min_samples_split=0.2;, score=0.901 total time= 0.0s
[CV 2/5] END max_depth=5, min_samples_split=0.2;, score=0.904 total time= 0.0s
[CV 3/5] END max_depth=5, min_samples_split=0.2;, score=0.887 total time= 0.0s
[CV 4/5] END max_depth=5, min_samples_split=0.2;, score=0.872 total time= 0.0s
[CV 5/5] END max_depth=5, min_samples_split=0.2;, score=0.889 total time= 0.0s
[CV 1/5] END max_depth=5, min_samples_split=0.30000000000000004;, score=0.849 total time= 0.0s
[CV 2/5] END max_depth=5, min_samples_split=0.30000000000000004;, score=0.880 total time= 0.0s
[CV 3/5] END max_depth=5, min_samples_split=0.30000000000000004;, score=0.876 total time= 0.0s
[CV 4/5] END max_depth=5, min_samples_split=0.30000000000000004;, score=0.849 total time= 0.0s
[CV 5/5] END max_depth=5, min_samples_split=0.30000000000000004;, score=0.870 total time= 0.0s
[CV 1/5] END max_depth=5, min_samples_split=0.4;, score=0.828 total time= 0.0s
[CV 2/5] END max_depth=5, min_samples_split=0.4;, score=0.852 total time= 0.0s
[CV 3/5] END max_depth=5, min_samples_split=0.4;, score=0.850 total time= 0.0s
[CV 4/5] END max_depth=5, min_samples_split=0.4;, score=0.821 total time= 0.0s
[CV 5/5] END max_depth=5, min_samples_split=0.4;, score=0.846 total time= 0.0s
[CV 1/5] END max_depth=5, min_samples_split=0.5;, score=0.828 total time= 0.0s
[CV 2/5] END max_depth=5, min_samples_split=0.5;, score=0.852 total time= 0.0s
[CV 3/5] END max_depth=5, min_samples_split=0.5;, score=0.850 total time= 0.0s
[CV 4/5] END max_depth=5, min_samples_split=0.5;, score=0.821 total time= 0.0s
[CV 5/5] END max_depth=5, min_samples_split=0.5;, score=0.846 total time= 0.0s
[CV 1/5] END max_depth=5, min_samples_split=0.6;, score=0.797 total time= 0.0s
[CV 2/5] END max_depth=5, min_samples_split=0.6;, score=0.813 total time= 0.0s
[CV 3/5] END max_depth=5, min_samples_split=0.6;, score=0.814 total time= 0.0s
[CV 4/5] END max_depth=5, min_samples_split=0.6;, score=0.782 total time= 0.0s
[CV 5/5] END max_depth=5, min_samples_split=0.6;, score=0.814 total time= 0.0s
[CV 1/5] END max_depth=5, min_samples_split=0.7000000000000001;, score=0.750 total time= 0.0s
[CV 2/5] END max_depth=5, min_samples_split=0.7000000000000001;, score=0.753 total time= 0.0s
[CV 3/5] END max_depth=5, min_samples_split=0.7000000000000001;, score=0.745 total time= 0.0s
[CV 4/5] END max_depth=5, min_samples_split=0.7000000000000001;, score=0.728 total time= 0.0s
[CV 5/5] END max_depth=5, min_samples_split=0.7000000000000001;, score=0.752 total time= 0.0s
[CV 1/5] END max_depth=5, min_samples_split=0.8;, score=0.692 total time= 0.0s
[CV 2/5] END max_depth=5, min_samples_split=0.8;, score=0.684 total time= 0.0s
[CV 3/5] END max_depth=5, min_samples_split=0.8;, score=0.670 total time= 0.0s
[CV 4/5] END max_depth=5, min_samples_split=0.8;, score=0.654 total time= 0.0s
[CV 5/5] END max_depth=5, min_samples_split=0.8;, score=0.677 total time= 0.0s
[CV 1/5] END max_depth=5, min_samples_split=0.9;, score=0.692 total time= 0.0s
[CV 2/5] END max_depth=5, min_samples_split=0.9;, score=0.684 total time= 0.0s
[CV 3/5] END max_depth=5, min_samples_split=0.9;, score=0.670 total time= 0.0s
[CV 4/5] END max_depth=5, min_samples_split=0.9;, score=0.654 total time= 0.0s
[CV 5/5] END max_depth=5, min_samples_split=0.9;, score=0.677 total time= 0.0s
[CV 1/5] END max_depth=5, min_samples_split=1.0;, score=0.692 total time= 0.0s
[CV 2/5] END max_depth=5, min_samples_split=1.0;, score=0.684 total time= 0.0s
[CV 3/5] END max_depth=5, min_samples_split=1.0;, score=0.670 total time= 0.0s
[CV 4/5] END max_depth=5, min_samples_split=1.0;, score=0.654 total time= 0.0s
[CV 5/5] END max_depth=5, min_samples_split=1.0;, score=0.677 total time= 0.0s
[CV 1/5] END max_depth=10, min_samples_split=0.1;, score=0.936 total time= 0.0s
[CV 2/5] END max_depth=10, min_samples_split=0.1;, score=0.947 total time= 0.0s
[CV 3/5] END max_depth=10, min_samples_split=0.1;, score=0.935 total time= 0.0s
[CV 4/5] END max_depth=10, min_samples_split=0.1;, score=0.925 total time= 0.0s
[CV 5/5] END max_depth=10, min_samples_split=0.1;, score=0.937 total time= 0.0s
[CV 1/5] END max_depth=10, min_samples_split=0.2;, score=0.916 total time= 0.0s
[CV 2/5] END max_depth=10, min_samples_split=0.2;, score=0.914 total time= 0.0s
[CV 3/5] END max_depth=10, min_samples_split=0.2;, score=0.897 total time= 0.0s
[CV 4/5] END max_depth=10, min_samples_split=0.2;, score=0.881 total time= 0.0s
[CV 5/5] END max_depth=10, min_samples_split=0.2;, score=0.895 total time= 0.0s
[CV 1/5] END max_depth=10, min_samples_split=0.30000000000000004;, score=0.849 total time= 0.0s
[CV 2/5] END max_depth=10, min_samples_split=0.30000000000000004;, score=0.880 total time= 0.0s
[CV 3/5] END max_depth=10, min_samples_split=0.30000000000000004;, score=0.876 total time= 0.0s
[CV 4/5] END max_depth=10, min_samples_split=0.30000000000000004;, score=0.849 total time= 0.0s
[CV 5/5] END max_depth=10, min_samples_split=0.30000000000000004;, score=0.870 total time= 0.0s
[CV 1/5] END max_depth=10, min_samples_split=0.4;, score=0.828 total time= 0.0s
[CV 2/5] END max_depth=10, min_samples_split=0.4;, score=0.852 total time= 0.0s
[CV 3/5] END max_depth=10, min_samples_split=0.4;, score=0.850 total time= 0.0s
[CV 4/5] END max_depth=10, min_samples_split=0.4;, score=0.821 total time= 0.0s
[CV 5/5] END max_depth=10, min_samples_split=0.4;, score=0.846 total time= 0.0s
[CV 1/5] END max_depth=10, min_samples_split=0.5;, score=0.828 total time= 0.0s
[CV 2/5] END max_depth=10, min_samples_split=0.5;, score=0.852 total time= 0.0s
[CV 3/5] END max_depth=10, min_samples_split=0.5;, score=0.850 total time= 0.0s
[CV 4/5] END max_depth=10, min_samples_split=0.5;, score=0.821 total time= 0.0s
[CV 5/5] END max_depth=10, min_samples_split=0.5;, score=0.846 total time= 0.0s
[CV 1/5] END max_depth=10, min_samples_split=0.6;, score=0.797 total time= 0.0s
[CV 2/5] END max_depth=10, min_samples_split=0.6;, score=0.813 total time= 0.0s
[CV 3/5] END max_depth=10, min_samples_split=0.6;, score=0.814 total time= 0.0s
[CV 4/5] END max_depth=10, min_samples_split=0.6;, score=0.782 total time= 0.0s
[CV 5/5] END max_depth=10, min_samples_split=0.6;, score=0.814 total time= 0.0s
[CV 1/5] END max_depth=10, min_samples_split=0.7000000000000001;, score=0.750 total time= 0.0s
[CV 2/5] END max_depth=10, min_samples_split=0.7000000000000001;, score=0.753 total time= 0.0s
[CV 3/5] END max_depth=10, min_samples_split=0.7000000000000001;, score=0.745 total time= 0.0s
[CV 4/5] END max_depth=10, min_samples_split=0.7000000000000001;, score=0.728 total time= 0.0s
[CV 5/5] END max_depth=10, min_samples_split=0.7000000000000001;, score=0.752 total time= 0.0s
[CV 1/5] END max_depth=10, min_samples_split=0.8;, score=0.692 total time= 0.0s
[CV 2/5] END max_depth=10, min_samples_split=0.8;, score=0.684 total time= 0.0s
[CV 3/5] END max_depth=10, min_samples_split=0.8;, score=0.670 total time= 0.0s
[CV 4/5] END max_depth=10, min_samples_split=0.8;, score=0.654 total time= 0.0s
[CV 5/5] END max_depth=10, min_samples_split=0.8;, score=0.677 total time= 0.0s
[CV 1/5] END max_depth=10, min_samples_split=0.9;, score=0.692 total time= 0.0s
[CV 2/5] END max_depth=10, min_samples_split=0.9;, score=0.684 total time= 0.0s
[CV 3/5] END max_depth=10, min_samples_split=0.9;, score=0.670 total time= 0.0s
[CV 4/5] END max_depth=10, min_samples_split=0.9;, score=0.654 total time= 0.0s
[CV 5/5] END max_depth=10, min_samples_split=0.9;, score=0.677 total time= 0.0s
[CV 1/5] END max_depth=10, min_samples_split=1.0;, score=0.692 total time= 0.0s
[CV 2/5] END max_depth=10, min_samples_split=1.0;, score=0.684 total time= 0.0s
[CV 3/5] END max_depth=10, min_samples_split=1.0;, score=0.670 total time= 0.0s
[CV 4/5] END max_depth=10, min_samples_split=1.0;, score=0.654 total time= 0.0s
[CV 5/5] END max_depth=10, min_samples_split=1.0;, score=0.677 total time= 0.0s
[CV 1/5] END max_depth=20, min_samples_split=0.1;, score=0.937 total time= 0.0s
[CV 2/5] END max_depth=20, min_samples_split=0.1;, score=0.947 total time= 0.0s
[CV 3/5] END max_depth=20, min_samples_split=0.1;, score=0.935 total time= 0.0s
[CV 4/5] END max_depth=20, min_samples_split=0.1;, score=0.926 total time= 0.0s
[CV 5/5] END max_depth=20, min_samples_split=0.1;, score=0.937 total time= 0.0s
[CV 1/5] END max_depth=20, min_samples_split=0.2;, score=0.916 total time= 0.0s
[CV 2/5] END max_depth=20, min_samples_split=0.2;, score=0.914 total time= 0.0s
[CV 3/5] END max_depth=20, min_samples_split=0.2;, score=0.897 total time= 0.0s
[CV 4/5] END max_depth=20, min_samples_split=0.2;, score=0.881 total time= 0.0s
[CV 5/5] END max_depth=20, min_samples_split=0.2;, score=0.895 total time= 0.0s
[CV 1/5] END max_depth=20, min_samples_split=0.30000000000000004;, score=0.849 total time= 0.0s
[CV 2/5] END max_depth=20, min_samples_split=0.30000000000000004;, score=0.880 total time= 0.0s
[CV 3/5] END max_depth=20, min_samples_split=0.30000000000000004;, score=0.876 total time= 0.0s
[CV 4/5] END max_depth=20, min_samples_split=0.30000000000000004;, score=0.849 total time= 0.0s
[CV 5/5] END max_depth=20, min_samples_split=0.30000000000000004;, score=0.870 total time= 0.0s
[CV 1/5] END max_depth=20, min_samples_split=0.4;, score=0.828 total time= 0.0s
[CV 2/5] END max_depth=20, min_samples_split=0.4;, score=0.852 total time= 0.0s
[CV 3/5] END max_depth=20, min_samples_split=0.4;, score=0.850 total time= 0.0s
[CV 4/5] END max_depth=20, min_samples_split=0.4;, score=0.821 total time= 0.0s
[CV 5/5] END max_depth=20, min_samples_split=0.4;, score=0.846 total time= 0.0s
[CV 1/5] END max_depth=20, min_samples_split=0.5;, score=0.828 total time= 0.0s
[CV 2/5] END max_depth=20, min_samples_split=0.5;, score=0.852 total time= 0.0s
[CV 3/5] END max_depth=20, min_samples_split=0.5;, score=0.850 total time= 0.0s
[CV 4/5] END max_depth=20, min_samples_split=0.5;, score=0.821 total time= 0.0s
[CV 5/5] END max_depth=20, min_samples_split=0.5;, score=0.846 total time= 0.0s
[CV 1/5] END max_depth=20, min_samples_split=0.6;, score=0.797 total time= 0.0s
[CV 2/5] END max_depth=20, min_samples_split=0.6;, score=0.813 total time= 0.0s
[CV 3/5] END max_depth=20, min_samples_split=0.6;, score=0.814 total time= 0.0s
[CV 4/5] END max_depth=20, min_samples_split=0.6;, score=0.782 total time= 0.0s
[CV 5/5] END max_depth=20, min_samples_split=0.6;, score=0.814 total time= 0.0s
[CV 1/5] END max_depth=20, min_samples_split=0.7000000000000001;, score=0.750 total time= 0.0s
[CV 2/5] END max_depth=20, min_samples_split=0.7000000000000001;, score=0.753 total time= 0.0s
[CV 3/5] END max_depth=20, min_samples_split=0.7000000000000001;, score=0.745 total time= 0.0s
[CV 4/5] END max_depth=20, min_samples_split=0.7000000000000001;, score=0.728 total time= 0.0s
[CV 5/5] END max_depth=20, min_samples_split=0.7000000000000001;, score=0.752 total time= 0.0s
[CV 1/5] END max_depth=20, min_samples_split=0.8;, score=0.692 total time= 0.0s
[CV 2/5] END max_depth=20, min_samples_split=0.8;, score=0.684 total time= 0.0s
[CV 3/5] END max_depth=20, min_samples_split=0.8;, score=0.670 total time= 0.0s
[CV 4/5] END max_depth=20, min_samples_split=0.8;, score=0.654 total time= 0.0s
[CV 5/5] END max_depth=20, min_samples_split=0.8;, score=0.677 total time= 0.0s
[CV 1/5] END max_depth=20, min_samples_split=0.9;, score=0.692 total time= 0.0s
[CV 2/5] END max_depth=20, min_samples_split=0.9;, score=0.684 total time= 0.0s
[CV 3/5] END max_depth=20, min_samples_split=0.9;, score=0.670 total time= 0.0s
[CV 4/5] END max_depth=20, min_samples_split=0.9;, score=0.654 total time= 0.0s
[CV 5/5] END max_depth=20, min_samples_split=0.9;, score=0.677 total time= 0.0s
[CV 1/5] END max_depth=20, min_samples_split=1.0;, score=0.692 total time= 0.0s
[CV 2/5] END max_depth=20, min_samples_split=1.0;, score=0.684 total time= 0.0s
[CV 3/5] END max_depth=20, min_samples_split=1.0;, score=0.670 total time= 0.0s
[CV 4/5] END max_depth=20, min_samples_split=1.0;, score=0.654 total time= 0.0s
[CV 5/5] END max_depth=20, min_samples_split=1.0;, score=0.677 total time= 0.0s
Grid Search Results:
{'mean_fit_time': array([0.01793795, 0.01365795, 0.01034584, 0.00998201, 0.00968375,
0.00919371, 0.00805531, 0.00647502, 0.0070219 , 0.00626655,
0.01232347, 0.0136539 , 0.01040902, 0.01022491, 0.0096272 ,
0.00876679, 0.00722232, 0.00549202, 0.00660048, 0.00576396,
0.01803284, 0.01193585, 0.0098424 , 0.00931005, 0.01038899,
0.01052971, 0.007619 , 0.00622945, 0.00554309, 0.00584545,
0.01788516, 0.01276417, 0.01033468, 0.00934911, 0.01064281,
0.00888958, 0.00868831, 0.00622072, 0.00687342, 0.00634537]), 'std_fit_time': array([3.62540687e-04, 1.71024030e-03, 1.01081223e-04, 1.11673406e-03,
1.27170808e-03, 7.63725939e-04, 1.49305824e-04, 1.99452747e-04,
1.54245421e-03, 1.87524990e-04, 2.46636506e-04, 2.95196485e-03,
9.58808668e-05, 1.32373407e-03, 9.73743738e-04, 1.41903992e-03,
1.17829965e-03, 5.58801260e-04, 2.13676575e-03, 1.06090578e-04,
1.51620185e-03, 7.95138087e-04, 5.26545087e-04, 9.60911700e-04,
1.98275636e-03, 1.47889253e-03, 1.13491411e-04, 1.17696093e-03,
3.73297859e-04, 4.25120226e-04, 1.66347640e-03, 1.45052742e-03,
5.99887647e-04, 4.91291528e-04, 1.82251867e-03, 1.04767953e-03,
1.41683714e-03, 2.39986301e-04, 2.06297308e-03, 1.44355124e-04]), 'mean_score_time': array([0.00425887, 0.00460052, 0.00396752, 0.0041244 , 0.00371642,
0.00424824, 0.00365424, 0.00349264, 0.00338583, 0.00340829,
0.00385275, 0.0056107 , 0.00375853, 0.00411191, 0.00398917,
0.00373616, 0.00342808, 0.00321245, 0.00329723, 0.00325661,
0.00421238, 0.00350823, 0.00349979, 0.00410066, 0.00451164,
0.00417023, 0.00349693, 0.00371151, 0.00357375, 0.00414972,
0.00407476, 0.0041286 , 0.00368938, 0.00346031, 0.00374012,
0.00433202, 0.00398989, 0.00332532, 0.00347228, 0.0040482 ]), 'std_score_time': array([1.01988658e-04, 1.11765744e-03, 7.85852983e-04, 5.77430918e-04,
2.74236317e-04, 8.97871689e-04, 1.14345312e-04, 2.60880542e-04,
8.86057109e-05, 1.10005888e-04, 1.75618200e-04, 2.24679730e-03,
1.19480454e-04, 6.35661323e-04, 9.31621278e-04, 8.36535971e-04,
1.44275244e-04, 1.07882128e-04, 1.82658920e-04, 2.14657419e-04,
7.08532701e-04, 1.22344986e-04, 1.21192120e-04, 1.03854251e-03,
1.28388397e-03, 6.96487628e-04, 9.01197504e-05, 8.37110432e-04,
4.14038026e-04, 1.51447250e-03, 9.25956374e-05, 6.65205244e-04,
1.85844650e-04, 6.95983976e-05, 2.17614520e-04, 1.60505438e-03,
8.79053710e-04, 1.27217401e-04, 3.00229201e-04, 1.15972513e-03]), 'param_max_depth': masked_array(data=[None, None, None, None, None, None, None, None, None,
None, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False],
fill_value='?',
dtype=object), 'param_min_samples_split': masked_array(data=[0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6,
0.7000000000000001, 0.8, 0.9, 1.0, 0.1, 0.2,
0.30000000000000004, 0.4, 0.5, 0.6, 0.7000000000000001,
0.8, 0.9, 1.0, 0.1, 0.2, 0.30000000000000004, 0.4, 0.5,
0.6, 0.7000000000000001, 0.8, 0.9, 1.0, 0.1, 0.2,
0.30000000000000004, 0.4, 0.5, 0.6, 0.7000000000000001,
0.8, 0.9, 1.0],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False],
fill_value='?',
dtype=object), 'params': [{'max_depth': None, 'min_samples_split': 0.1}, {'max_depth': None, 'min_samples_split': 0.2}, {'max_depth': None, 'min_samples_split': 0.30000000000000004}, {'max_depth': None, 'min_samples_split': 0.4}, {'max_depth': None, 'min_samples_split': 0.5}, {'max_depth': None, 'min_samples_split': 0.6}, {'max_depth': None, 'min_samples_split': 0.7000000000000001}, {'max_depth': None, 'min_samples_split': 0.8}, {'max_depth': None, 'min_samples_split': 0.9}, {'max_depth': None, 'min_samples_split': 1.0}, {'max_depth': 5, 'min_samples_split': 0.1}, {'max_depth': 5, 'min_samples_split': 0.2}, {'max_depth': 5, 'min_samples_split': 0.30000000000000004}, {'max_depth': 5, 'min_samples_split': 0.4}, {'max_depth': 5, 'min_samples_split': 0.5}, {'max_depth': 5, 'min_samples_split': 0.6}, {'max_depth': 5, 'min_samples_split': 0.7000000000000001}, {'max_depth': 5, 'min_samples_split': 0.8}, {'max_depth': 5, 'min_samples_split': 0.9}, {'max_depth': 5, 'min_samples_split': 1.0}, {'max_depth': 10, 'min_samples_split': 0.1}, {'max_depth': 10, 'min_samples_split': 0.2}, {'max_depth': 10, 'min_samples_split': 0.30000000000000004}, {'max_depth': 10, 'min_samples_split': 0.4}, {'max_depth': 10, 'min_samples_split': 0.5}, {'max_depth': 10, 'min_samples_split': 0.6}, {'max_depth': 10, 'min_samples_split': 0.7000000000000001}, {'max_depth': 10, 'min_samples_split': 0.8}, {'max_depth': 10, 'min_samples_split': 0.9}, {'max_depth': 10, 'min_samples_split': 1.0}, {'max_depth': 20, 'min_samples_split': 0.1}, {'max_depth': 20, 'min_samples_split': 0.2}, {'max_depth': 20, 'min_samples_split': 0.30000000000000004}, {'max_depth': 20, 'min_samples_split': 0.4}, {'max_depth': 20, 'min_samples_split': 0.5}, {'max_depth': 20, 'min_samples_split': 0.6}, {'max_depth': 20, 'min_samples_split': 0.7000000000000001}, {'max_depth': 20, 'min_samples_split': 0.8}, {'max_depth': 20, 'min_samples_split': 0.9}, {'max_depth': 20, 'min_samples_split': 1.0}], 'split0_test_score': array([0.93730181, 0.9160884 , 0.84869493, 0.82794936, 0.82794936,
0.79748725, 0.75029107, 0.69175893, 0.69175893, 0.69175893,
0.9089714 , 0.90057156, 0.84869493, 0.82794936, 0.82794936,
0.79748725, 0.75029107, 0.69175893, 0.69175893, 0.69175893,
0.93634074, 0.9160884 , 0.84869493, 0.82794936, 0.82794936,
0.79748725, 0.75029107, 0.69175893, 0.69175893, 0.69175893,
0.93730181, 0.9160884 , 0.84869493, 0.82794936, 0.82794936,
0.79748725, 0.75029107, 0.69175893, 0.69175893, 0.69175893]), 'split1_test_score': array([0.94746502, 0.91443934, 0.88002498, 0.8516226 , 0.8516226 ,
0.81261881, 0.75315735, 0.68368509, 0.68368509, 0.68368509,
0.92617541, 0.90427189, 0.88002498, 0.8516226 , 0.8516226 ,
0.81261881, 0.75315735, 0.68368509, 0.68368509, 0.68368509,
0.94746502, 0.91443934, 0.88002498, 0.8516226 , 0.8516226 ,
0.81261881, 0.75315735, 0.68368509, 0.68368509, 0.68368509,
0.94746502, 0.91443934, 0.88002498, 0.8516226 , 0.8516226 ,
0.81261881, 0.75315735, 0.68368509, 0.68368509, 0.68368509]), 'split2_test_score': array([0.93467129, 0.89741545, 0.8758676 , 0.84978872, 0.84978872,
0.81408353, 0.74532048, 0.67030394, 0.67030394, 0.67030394,
0.9056396 , 0.88703396, 0.8758676 , 0.84978872, 0.84978872,
0.81408353, 0.74532048, 0.67030394, 0.67030394, 0.67030394,
0.93467129, 0.89741545, 0.8758676 , 0.84978872, 0.84978872,
0.81408353, 0.74532048, 0.67030394, 0.67030394, 0.67030394,
0.93467129, 0.89741545, 0.8758676 , 0.84978872, 0.84978872,
0.81408353, 0.74532048, 0.67030394, 0.67030394, 0.67030394]), 'split3_test_score': array([0.92551228, 0.88086844, 0.84915574, 0.8209622 , 0.8209622 ,
0.781683 , 0.72827203, 0.65426159, 0.65426159, 0.65426159,
0.88207967, 0.87152009, 0.84915574, 0.8209622 , 0.8209622 ,
0.781683 , 0.72827203, 0.65426159, 0.65426159, 0.65426159,
0.92505621, 0.88086844, 0.84915574, 0.8209622 , 0.8209622 ,
0.781683 , 0.72827203, 0.65426159, 0.65426159, 0.65426159,
0.92551228, 0.88086844, 0.84915574, 0.8209622 , 0.8209622 ,
0.781683 , 0.72827203, 0.65426159, 0.65426159, 0.65426159]), 'split4_test_score': array([0.93748886, 0.89481142, 0.86989733, 0.84612023, 0.84612023,
0.81373086, 0.75236095, 0.67690594, 0.67690594, 0.67690594,
0.90133215, 0.88854737, 0.86989733, 0.84612023, 0.84612023,
0.81373086, 0.75236095, 0.67690594, 0.67690594, 0.67690594,
0.93747189, 0.89481142, 0.86989733, 0.84612023, 0.84612023,
0.81373086, 0.75236095, 0.67690594, 0.67690594, 0.67690594,
0.93748886, 0.89481142, 0.86989733, 0.84612023, 0.84612023,
0.81373086, 0.75236095, 0.67690594, 0.67690594, 0.67690594]), 'mean_test_score': array([0.93648785, 0.90072461, 0.86472812, 0.83928862, 0.83928862,
0.80392069, 0.74588038, 0.6753831 , 0.6753831 , 0.6753831 ,
0.90483965, 0.89038897, 0.86472812, 0.83928862, 0.83928862,
0.80392069, 0.74588038, 0.6753831 , 0.6753831 , 0.6753831 ,
0.93620103, 0.90072461, 0.86472812, 0.83928862, 0.83928862,
0.80392069, 0.74588038, 0.6753831 , 0.6753831 , 0.6753831 ,
0.93648785, 0.90072461, 0.86472812, 0.83928862, 0.83928862,
0.80392069, 0.74588038, 0.6753831 , 0.6753831 , 0.6753831 ]), 'std_test_score': array([0.00701321, 0.0131478 , 0.01329936, 0.01243774, 0.01243774,
0.01273644, 0.00921709, 0.01273833, 0.01273833, 0.01273833,
0.01416506, 0.01155078, 0.01329936, 0.01243774, 0.01243774,
0.01273644, 0.00921709, 0.01273833, 0.01273833, 0.01273833,
0.00714226, 0.0131478 , 0.01329936, 0.01243774, 0.01243774,
0.01273644, 0.00921709, 0.01273833, 0.01273833, 0.01273833,
0.00701321, 0.0131478 , 0.01329936, 0.01243774, 0.01243774,
0.01273644, 0.00921709, 0.01273833, 0.01273833, 0.01273833]), 'rank_test_score': array([ 1, 5, 9, 13, 13, 21, 25, 29, 29, 29, 4, 8, 9, 13, 13, 21, 25,
29, 29, 29, 3, 5, 9, 13, 13, 21, 25, 29, 29, 29, 1, 5, 9, 13,
13, 21, 25, 29, 29, 29], dtype=int32)}
Best Parameters: {'max_depth': None, 'min_samples_split': 0.1}
Mean Test Scores: [0.93648785 0.90072461 0.86472812 0.83928862 0.83928862 0.80392069
0.74588038 0.6753831 0.6753831 0.6753831 0.90483965 0.89038897
0.86472812 0.83928862 0.83928862 0.80392069 0.74588038 0.6753831
0.6753831 0.6753831 0.93620103 0.90072461 0.86472812 0.83928862
0.83928862 0.80392069 0.74588038 0.6753831 0.6753831 0.6753831
0.93648785 0.90072461 0.86472812 0.83928862 0.83928862 0.80392069
0.74588038 0.6753831 0.6753831 0.6753831 ]
AUC for the Best Model: 0.9459130947003455
from sklearn.model_selection import cross_val_score
results = []
for max_depth in max_depths:
for min_samples_split in min_samples_splits:
model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split)
auc_scores = cross_val_score(model, X, Y, cv=5, scoring='roc_auc')
recall_scores = cross_val_score(model, X, Y, cv=5, scoring='recall')
precision_scores = cross_val_score(model, X, Y, cv=5, scoring='precision')
f1_scores = cross_val_score(model, X, Y, cv=5, scoring='f1')
results.append({
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'AUC': np.mean(auc_scores),
'Recall': np.mean(recall_scores),
'Precision': np.mean(precision_scores),
'F1': np.mean(f1_scores)
})
df_results = pd.DataFrame(results)
print(df_results)
max_depth min_samples_split AUC Recall Precision F1 0 NaN 0.1 0.936488 0.870050 0.856195 0.862832 1 NaN 0.2 0.900725 0.708746 0.910136 0.795916 2 NaN 0.3 0.864728 0.830870 0.765836 0.796820 3 NaN 0.4 0.839289 0.830870 0.765836 0.796820 4 NaN 0.5 0.839289 0.830870 0.765836 0.796820 5 NaN 0.6 0.803921 0.830870 0.765836 0.796820 6 NaN 0.7 0.745880 0.893169 0.685405 0.775287 7 NaN 0.8 0.675383 0.938952 0.614532 0.742469 8 NaN 0.9 0.675383 0.938952 0.614532 0.742469 9 NaN 1.0 0.675383 0.938952 0.614532 0.742469 10 5.0 0.1 0.904836 0.937714 0.747749 0.831536 11 5.0 0.2 0.890389 0.870878 0.759751 0.811409 12 5.0 0.3 0.864728 0.830870 0.765836 0.796820 13 5.0 0.4 0.839289 0.830870 0.765836 0.796820 14 5.0 0.5 0.839289 0.830870 0.765836 0.796820 15 5.0 0.6 0.803921 0.830870 0.765836 0.796820 16 5.0 0.7 0.745880 0.893169 0.685405 0.775287 17 5.0 0.8 0.675383 0.938952 0.614532 0.742469 18 5.0 0.9 0.675383 0.938952 0.614532 0.742469 19 5.0 1.0 0.675383 0.938952 0.614532 0.742469 20 10.0 0.1 0.936201 0.870050 0.856195 0.862832 21 10.0 0.2 0.900725 0.708746 0.910136 0.795916 22 10.0 0.3 0.864728 0.830870 0.765836 0.796820 23 10.0 0.4 0.839289 0.830870 0.765836 0.796820 24 10.0 0.5 0.839289 0.830870 0.765836 0.796820 25 10.0 0.6 0.803921 0.830870 0.765836 0.796820 26 10.0 0.7 0.745880 0.893169 0.685405 0.775287 27 10.0 0.8 0.675383 0.938952 0.614532 0.742469 28 10.0 0.9 0.675383 0.938952 0.614532 0.742469 29 10.0 1.0 0.675383 0.938952 0.614532 0.742469 30 20.0 0.1 0.936488 0.870050 0.856195 0.862832 31 20.0 0.2 0.900725 0.708746 0.910136 0.795916 32 20.0 0.3 0.864728 0.830870 0.765836 0.796820 33 20.0 0.4 0.839289 0.830870 0.765836 0.796820 34 20.0 0.5 0.839289 0.830870 0.765836 0.796820 35 20.0 0.6 0.803921 0.830870 0.765836 0.796820 36 20.0 0.7 0.745880 0.893169 0.685405 0.775287 37 20.0 0.8 0.675383 0.938952 0.614532 0.742469 38 20.0 0.9 0.675383 0.938952 0.614532 0.742469 39 20.0 1.0 0.675383 0.938952 0.614532 0.742469
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
best_max_depth = best_params['max_depth']
best_min_samples_split = best_params['min_samples_split']
best_model = DecisionTreeClassifier(max_depth=best_max_depth, min_samples_split=best_min_samples_split)
best_model.fit(X, Y)
y_pred_proba = best_model.predict_proba(X)[:, 1]
fpr, tpr, _ = roc_curve(Y, y_pred_proba)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()
Y=df[["CLASS_LABEL"]]
X=df[["InsecureForms","NumQueryComponents","NumDashInHostname","NoHttps","PctExtResourceUrls","NumDots","IpAddress"]]
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,random_state=1)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
kf = KFold(n_splits=5, random_state=None, shuffle=True)
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
print(min_samples_splits)
avg_f1_test = []
avg_f1_train = []
avg_n_leaves = []
for mss in min_samples_splits:
f1_train = []
f1_test = []
n_leaves = []
for train_index, test_index in kf.split(X):
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
clf = tree.DecisionTreeClassifier(min_samples_split=mss)
clf = clf.fit(X_train, Y_train)
Y_test_predicted = clf.predict(X_test)
Y_train_predicted = clf.predict(X_train)
f1_test.append(f1_score(Y_test, Y_test_predicted, pos_label=0))
f1_train.append(f1_score(Y_train, Y_train_predicted, pos_label=0))
n_leaves.append(clf.get_n_leaves())
avg_f1_test.append(np.mean(f1_test))
avg_f1_train.append(np.mean(f1_train))
avg_n_leaves.append(np.mean(n_leaves))
[0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
plt.figure(figsize=(4,4))
plt.plot(min_samples_splits,avg_f1_test,label='Testing Set')
plt.plot(min_samples_splits,avg_f1_train,label='Training Set')
plt.legend()
plt.xticks(min_samples_splits)
plt.grid(color='b', axis='x', linestyle='-.', linewidth=1,alpha=0.2)
plt.xlabel('Minimum Sample Split Fraction')
plt.ylabel('F1')
Text(0, 0.5, 'F1')
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
clf = tree.DecisionTreeClassifier(max_depth=None, min_samples_split=0.5)
clf.fit(X, Y)
Y_prob = clf.predict_proba(X)[:, 1]
fpr, tpr, thresholds = roc_curve(Y, Y_prob)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
accuracy = accuracy_score(Y, Y_pred)
precision = precision_score(Y, Y_pred)
recall = recall_score(Y, Y_pred)
f1 = f1_score(Y, Y_pred)
auc = roc_auc_score(Y, Y_prob)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")
Accuracy: 0.8163 Precision: 0.8260 Recall: 0.8007 F1 Score: 0.8132 AUC: 0.8005
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, roc_auc_score
param_grid = {'C': [0.1, 1, 10, 20, 30]}
svm_classifier = SVC(kernel='rbf')
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5, scoring="roc_auc")
grid_search.fit(X_train, Y_train)
print("Best C:", grid_search.best_params_['C'])
best_svm_classifier = SVC(C=grid_search.best_params_['C'], kernel='rbf')
best_svm_classifier.fit(X_train, Y_train)
Y_pred = best_svm_classifier.predict(X)
/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)
Best C: 20
/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)
clf = svm.SVC(C=20,kernel='rbf')
clf.fit(X, Y)
Y_pre=clf.predict(X)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
accuracy = accuracy_score(Y, Y_pred)
precision = precision_score(Y, Y_pred)
recall = recall_score(Y, Y_pred)
f1 = f1_score(Y, Y_pred)
Y_prob = clf.decision_function(X)
auc = roc_auc_score(Y, Y_prob)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC Score:', auc)
/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)
Accuracy: 0.8138768787317274 Precision: 0.820675105485232 Recall: 0.8023927392739274 F1 Score: 0.8114309553608678 AUC Score: 0.9050807516016706
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
param_grid = {
'n_estimators': [50, 100, 150],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='roc_auc', verbose=2, n_jobs=-1)
grid_search.fit(X_train, Y_train)
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
best_rf = RandomForestClassifier(
n_estimators=best_params['n_estimators'],
max_depth=best_params['max_depth'],
min_samples_split=best_params['min_samples_split'],
min_samples_leaf=best_params['min_samples_leaf']
)
best_rf.fit(X_train, Y_train)
Y_pred = best_rf.predict(X_test)
Y_prob = best_rf.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
auc = roc_auc_score(Y_test, Y_prob)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC Score:', auc)
Fitting 5 folds for each of 108 candidates, totalling 540 fits
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py:909: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel(). self.best_estimator_.fit(X, y, **fit_params)
Best Parameters: {'max_depth': 30, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 150}
<ipython-input-237-07315e3d4f15>:26: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel(). best_rf.fit(X_train, Y_train)
Accuracy: 0.8578784757981462 Precision: 0.860655737704918 Recall: 0.8571428571428571 F1 Score: 0.8588957055214723 AUC Score: 0.931895710467139
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(Y_test, Y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_rf.classes_)
disp.plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix')
plt.show()
tdf=pd.read_csv('Phishing_Legitimate_test_student (2).csv',index_col='id',na_values=['',' ','n/a','null'])
tdf.head()
| NumDots | SubdomainLevel | PathLevel | UrlLength | NumDash | NumDashInHostname | AtSymbol | TildeSymbol | NumUnderscore | NumPercent | ... | ExtFavicon | InsecureForms | RelativeFormAction | ExtFormAction | AbnormalFormAction | RightClickDisabled | PopUpWindow | IframeOrFrame | MissingTitle | ImagesOnlyInForm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | |||||||||||||||||||||
| 1 | 6 | 1 | 2 | 59 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 2 | 1 | 3 | 76 | 4 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 3 | 3 | 1 | 1 | 59 | 0 | 0 | 0 | 0 | 3 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 4 | 5 | 1 | 3 | 67 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 2 | 0 | 4 | 88 | 3 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
5 rows × 37 columns
tdf=tdf[["InsecureForms","NumQueryComponents","NumDashInHostname","NoHttps","PctExtResourceUrls","NumDots","IpAddress"]]
tdf.head()
| InsecureForms | NumQueryComponents | NumDashInHostname | NoHttps | PctExtResourceUrls | NumDots | IpAddress | |
|---|---|---|---|---|---|---|---|
| id | |||||||
| 1 | 1 | 0 | 0 | 1 | 1.00000 | 6 | 0 |
| 2 | 1 | 0 | 0 | 1 | 1.00000 | 2 | 0 |
| 3 | 0 | 0 | 0 | 1 | 0.87500 | 3 | 0 |
| 4 | 1 | 0 | 0 | 1 | 0.15625 | 5 | 0 |
| 5 | 1 | 0 | 0 | 1 | 0.00000 | 2 | 0 |
best_params = {
'n_estimators': 150,
'max_depth': 30,
'min_samples_split': 10,
'min_samples_leaf': 4
}
best_rf = RandomForestClassifier(
n_estimators=best_params['n_estimators'],
max_depth=best_params['max_depth'],
min_samples_split=best_params['min_samples_split'],
min_samples_leaf=best_params['min_samples_leaf']
)
best_rf.fit(X,Y)
<ipython-input-246-d34dca0acf94>:15: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel(). best_rf.fit(X,Y)
RandomForestClassifier(max_depth=30, min_samples_leaf=4, min_samples_split=10,
n_estimators=150)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomForestClassifier(max_depth=30, min_samples_leaf=4, min_samples_split=10,
n_estimators=150)Y_pred = best_rf.predict(tdf)
print(Y_pred)
[1 1 0 ... 0 1 0]
data = {
"Prediction": Y_pred,
}
final = pd.DataFrame(data, index=[tdf.index])
final
| Prediction | |
|---|---|
| id | |
| 1 | 1 |
| 2 | 1 |
| 3 | 0 |
| 4 | 1 |
| 5 | 1 |
| ... | ... |
| 4996 | 0 |
| 4997 | 0 |
| 4998 | 0 |
| 4999 | 1 |
| 5000 | 0 |
5000 rows × 1 columns
final.to_csv("Predictions.csv")